from __future__ import print_function
from gensim.models import KeyedVectors
# Load pre-trained Korean Wikipedia word vectors (word2vec text format).
ko_model = KeyedVectors.load_word2vec_format('wiki.ko.vec')
# Persist in gensim's native format for faster reloads.
ko_model.save('ko_model')
# Replace raw vectors with their L2-normalized form to free memory
# (vectors become read-only for similarity queries after this).
ko_model.init_sims(replace=True)
# Collect the vocabulary into a list.
# (idiom fix: list() over the vocab mapping instead of a manual append loop)
words = list(ko_model.vocab)
find_similar_to = '사랑'
# Print the 10 words most similar to '사랑'.
for word, score in ko_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(word, score))
word_add = ['동물', '파충류']
word_sub = ['뱀']
# Analogy query: words close to (동물 + 파충류 - 뱀).
# Compute once and reuse — the original issued the identical
# most_similar query a second time just to print the raw list.
similarities = ko_model.most_similar(positive=word_add, negative=word_sub)
for word, score in similarities:
    print("Word : {0} , Similarity: {1:.2f}".format(word, score))
print(similarities)
# Find the word that least belongs in the set.
not_matching = ko_model.doesnt_match("아침 점심 저녁 된장국".split())
print(not_matching)
# Pairwise cosine-similarity scores for a few word pairs.
for left, right in (('컴퓨터', '인간'), ('로봇', '인간'), ('사랑해', '사랑의')):
    sim_score = ko_model.similarity(left, right)
    print(sim_score)
# Ten words most similar to '전자'.
print(ko_model.most_similar('전자'))
from gensim.models import FastText
# Load the fastText binary model (includes subword n-gram information).
ft2 = FastText.load_fasttext_format('wiki.ko.bin')
# Persist the fastText model.
# (bug fix: the original called ko_model.save('ft2') here, saving the
# wrong model under this filename)
ft2.save('ft2')
# Normalize vectors in place to free memory.
ft2.init_sims(replace=True)
find_similar_to = '사랑'
# Top-10 words most similar to '사랑', accessed via .wv for
# consistency with the other ft2 queries below.
for word, score in ft2.wv.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(word, score))
word_add = ['동물', '파충류']
word_sub = ['뱀']
# Analogy query on the fastText vectors; compute once and reuse —
# the original issued the identical most_similar query twice.
similarities = ft2.wv.most_similar(positive=word_add, negative=word_sub)
for word, score in similarities:
    print("Word : {0} , Similarity: {1:.2f}".format(word, score))
print(similarities)
# Find the word that least belongs in the set.
not_matching = ft2.wv.doesnt_match("아침 점심 저녁 된장국".split())
print(not_matching)
# Pairwise similarity scores.
sim_score = ft2.wv.similarity('컴퓨터', '인간')
print(sim_score)
sim_score = ft2.wv.similarity('로봇', '인간')
print(sim_score)
sim_score = ft2.wv.similarity('사랑해', '사랑의')
print(sim_score)
# Ten nearest neighbours of '전자'.
print(ft2.wv.most_similar('전자'))
import gensim
# Load a previously trained gensim Word2Vec model.
ko_w2v = gensim.models.Word2Vec.load('ko.bin')
# Replace raw vectors with their L2-normalized form to save memory.
ko_w2v.init_sims(replace=True)
# Analogy query: words close to (동물 + 파충류 - 뱀).
similarities_wv = ko_w2v.wv.most_similar(positive=['동물', '파충류'], negative=['뱀'])
print(similarities_wv)
# Similarity scores for two word pairs.
for pair in (('컴퓨터', '인간'), ('로봇', '인간')):
    sim_score_wv = ko_w2v.wv.similarity(*pair)
    print(sim_score_wv)
# Ten nearest neighbours of '전자'.
print(ko_w2v.wv.most_similar(positive=["전자"], topn=10))
from soynlp.utils import DoublespaceLineCorpus
from soynlp.vectorizer import sent_to_word_contexts_matrix
# Build a word-context co-occurrence matrix from the news-article corpus.
corpus_path = '2016-10-20_article_all_normed.txt'
corpus = DoublespaceLineCorpus(corpus_path, iter_sent=True)
# x: sparse co-occurrence matrix; idx2vocab: row/column index -> word.
x, idx2vocab = sent_to_word_contexts_matrix(
    corpus,
    windows=3,                           # context window of +/- 3 words
    min_tf=10,                           # ignore words seen fewer than 10 times
    tokenizer=lambda sent: sent.split(), # whitespace tokenizer (the default)
    dynamic_weight=True,                 # weight contexts by distance
    verbose=True,
)
print(x.shape)
from glove import Glove
# Train a GloVe model on the co-occurrence matrix.
glove = Glove(no_components=100, learning_rate=0.05, max_count=30)
glove.fit(x.tocoo(), epochs=5, no_threads=4, verbose=True)
# Attach a word -> row-index dictionary so the model can be queried by word.
dictionary = dict((word, index) for index, word in enumerate(idx2vocab))
glove.add_dictionary(dictionary)
# Ten nearest neighbours for a few probe words.
for probe in ('사랑', '동물', '로봇'):
    print(glove.most_similar(probe, number=10))
#from kor2vec import Kor2Vec
#kor2vec = Kor2Vec(embed_size=128)
#kor2vec.train("2016-10-20_article_all_normed.txt")